#!/bin/bash

# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)

set -ex -o pipefail

# Suppress ANSI color escape sequences
export TERM=vt100

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave the workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
    echo "sudo may print the following warning message that can be ignored. The chown command will still run."
    echo "    sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
    echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
    sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
  }
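  # trap_add (a helper sourced above) chains the handler onto any existing
  # EXIT trap rather than replacing it.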
  # Disable shellcheck SC2064 as we intentionally capture the original owner
  # ID immediately rather than when the trap fires.
  # shellcheck disable=SC2064
  trap_add cleanup_workspace EXIT
  sudo chown -R jenkins /var/lib/jenkins/workspace
  git config --global --add safe.directory /var/lib/jenkins/workspace
fi


# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
  NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
  if [ -n "$NUMBA_CUDA_DIR" ]; then
    NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
    pushd "$NUMBA_CUDA_DIR"
    patch -p4 <"$NUMBA_PATCH"
    popd
  fi
fi

echo "Environment variables:"
env

TORCH_INSTALL_DIR=$(python -c "import site; print(site.getsitepackages()[0])")/torch
TORCH_BIN_DIR="$TORCH_INSTALL_DIR"/bin
TORCH_LIB_DIR="$TORCH_INSTALL_DIR"/lib
TORCH_TEST_DIR="$TORCH_INSTALL_DIR"/test

BUILD_DIR="build"
BUILD_RENAMED_DIR="build_renamed"
BUILD_BIN_DIR="$BUILD_DIR"/bin

# Set default values for these variables in case they are not set
SHARD_NUMBER="${SHARD_NUMBER:-1}"
NUM_TEST_SHARDS="${NUM_TEST_SHARDS:-1}"
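# For example, a hypothetical local invocation running shard 2 of 4 would set
#   SHARD_NUMBER=2 NUM_TEST_SHARDS=4
# in the environment before running this script.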

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

export VALGRIND=ON
# export TORCH_INDUCTOR_INSTALL_GXX=ON
if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # clang9 appears to miscompile code involving std::optional<c10::SymInt>,
  # such that valgrind complains along these lines:
  #
  # Conditional jump or move depends on uninitialised value(s)
  #    at 0x40303A: ~optional_base (Optional.h:281)
  #    by 0x40303A: call (Dispatcher.h:448)
  #    by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:10)
  #    by 0x403700: main (basic.cpp:16)
  #  Uninitialised value was created by a stack allocation
  #    at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:6)
  #
  # The problem does not appear with gcc or newer versions of clang (we tested
  # clang14).  So we suppress valgrind testing for clang9 specifically (xpu
  # builds also take this branch and are skipped likewise).  You may need to
  # suppress it for other versions of clang if they still have the bug.
  #
  # A minimal repro for the valgrind error is below:
  #
  # #include <ATen/ATen.h>
  # #include <ATen/core/dispatch/Dispatcher.h>
  #
  # using namespace at;
  #
  # Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, std::optional<c10::SymInt> storage_offset) {
  #   auto op = c10::Dispatcher::singleton()
  #       .findSchemaOrThrow(at::_ops::as_strided::name, at::_ops::as_strided::overload_name)
  #       .typed<at::_ops::as_strided::schema>();
  #   return op.call(self, size, stride, storage_offset);
  # }
  #
  # int main() {
  #   Tensor b = empty({3, 4});
  #   auto z = call(b, b.sym_sizes(), b.sym_strides(), std::nullopt);
  # }
  export VALGRIND=OFF
fi

detect_cuda_arch

if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
  # There are additional warnings on s390x, maybe due to newer gcc.
  # Skip this check for now
  export VALGRIND=OFF
fi

if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]] || [[ "${CONTINUE_THROUGH_ERROR}" == "1" ]]; then
  # When rerunning disabled tests, do not generate core dumps as they could consume
  # the runner's disk space when crashed tests are run multiple times. Running out
  # of space is a nasty issue because there would be no space left even to download
  # the GHA actions needed to clean up the disk
  #
  # We also want to turn off core dumps when CONTINUE_THROUGH_ERROR is set as there
  # is a small risk of having multiple core files generated. Arguably, they are not
  # that useful in this case anyway and the test will still continue
  ulimit -c 0

  # Note that by piping the core dump to a script set in /proc/sys/kernel/core_pattern
  # as documented in https://man7.org/linux/man-pages/man5/core.5.html, we could
  # dynamically stop generating more core files when the disk space drops below a
  # certain threshold. However, this is not supported inside Docker containers at the moment
fi

# Get fully qualified path using realpath
if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
  CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-"build/custom_test_artifacts"}")
fi

# Reduce set of tests to include when running run_test.py
if [[ -n $TESTS_TO_INCLUDE ]]; then
  echo "Setting INCLUDE_CLAUSE"
  INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
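# e.g., TESTS_TO_INCLUDE="test_nn test_torch" yields
#   INCLUDE_CLAUSE="--include test_nn test_torch"
# (expanded unquoted later so the test names become separate arguments)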

echo "Environment variables"
env

echo "Testing pytorch"

export LANG=C.UTF-8

PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}}

if [[ "$TEST_CONFIG" == 'default' ]]; then
  export CUDA_VISIBLE_DEVICES=0
  export HIP_VISIBLE_DEVICES=0
fi

if [[ "$TEST_CONFIG" == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  export HIP_VISIBLE_DEVICES=0,1,2,3
fi

if [[ "$TEST_CONFIG" == 'slow' ]]; then
  export PYTORCH_TEST_WITH_SLOW=1
  export PYTORCH_TEST_SKIP_FAST=1
fi

if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
  export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1
  # TODO: slow gradcheck tests run out of memory a lot recently, so setting this
  # to run them sequentially with only one process to mitigate the issue
  export PYTORCH_TEST_CUDA_MEM_LEAK_CHECK=1
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # Used so that only cuda/rocm specific versions of tests are generated
  # mainly used so that we're not spending extra cycles testing cpu
  # devices on expensive gpu machines
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
  # Pass --xpu to run_test.py invocations via PYTHON_TEST_EXTRA_OPTION
  export PYTHON_TEST_EXTRA_OPTION="--xpu"
fi

if [[ "$TEST_CONFIG" == *crossref* ]]; then
  export PYTORCH_TEST_WITH_CROSSREF=1
fi

if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1
  export VALGRIND=OFF
  # Print GPU info
  rocminfo
  rocminfo | grep -E 'Name:.*\sgfx|Marketing'

  # for benchmarks/dynamo/check_accuracy.py, we need to put results in a rocm specific directory to avoid clashes with cuda
  MAYBE_ROCM="rocm/"
fi

if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # Source the Intel oneAPI environment script to enable xpu runtime related libraries
  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  if [ -f /opt/intel/oneapi/umf/latest/env/vars.sh ]; then
    # shellcheck disable=SC1091
    source /opt/intel/oneapi/umf/latest/env/vars.sh
  fi
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
fi

if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
  # JIT C++ extensions require ninja.
  pip_install "ninja==1.10.2"
  # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
  # but this script should be runnable by any user, including root
  export PATH="$HOME/.local/bin:$PATH"
fi

if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  # TODO: revisit this once the CI is stabilized on aarch64 linux
  export VALGRIND=OFF
fi

# DANGER WILL ROBINSON.  The LD_PRELOAD here could cause you problems
# if you're not careful.  Check here first if you made some changes and the
# ASAN tests stop working
if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=true:strict_init_order=true:detect_odr_violation=1:detect_container_overflow=0:check_initialization_order=true:debug=true
    if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
        export ASAN_OPTIONS="${ASAN_OPTIONS}:protect_shadow_gap=0"
    fi
    export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
    export PYTORCH_TEST_WITH_ASAN=1
    export PYTORCH_TEST_WITH_UBSAN=1
    # TODO: Figure out how to avoid hard-coding these paths
    export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-18/bin/llvm-symbolizer
    export TORCH_USE_RTLD_GLOBAL=1
    # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
    # default behavior.
    #
    # The reason for this is that without RTLD_GLOBAL, if we load multiple
    # libraries that depend on libtorch (as is the case with C++ extensions), we
    # will get multiple copies of libtorch in our address space.  When UBSAN is
    # turned on, it will do a bunch of virtual pointer consistency checks which
    # won't work correctly.  When this happens, you get a violation like:
    #
    #    member call on address XXXXXX which does not point to an object of
    #    type 'std::_Sp_counted_base<__gnu_cxx::_Lock_policy::_S_atomic>'
    #    XXXXXX note: object is of type
    #    'std::_Sp_counted_ptr<torch::nn::LinearImpl*, (__gnu_cxx::_Lock_policy)2>'
    #
    # (NB: the textual types of the objects here are misleading, because
    # they actually line up; it just so happens that there's two copies
    # of the type info floating around in the address space, so they
    # don't pointer compare equal.  See also
    #   https://github.com/google/sanitizers/issues/1175)
    #
    # UBSAN is kind of right here: if we relied on RTTI across C++ extension
    # modules they would indeed do the wrong thing; but in our codebase, we
    # don't use RTTI (because it doesn't work on mobile).  To appease
    # UBSAN, however, it's better if we ensure all the copies agree!
    #
    # By the way, an earlier version of this code attempted to load
    # libtorch_python.so with LD_PRELOAD, which has a similar effect of causing
    # it to be loaded globally.  This isn't really a good idea though, because
    # it depends on a ton of dynamic libraries that most programs aren't gonna
    # have, and it applies to child processes.

    LD_PRELOAD=$(clang --print-file-name=libclang_rt.asan-x86_64.so)
    export LD_PRELOAD
    # Disable valgrind for asan
    export VALGRIND=OFF

    (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
    echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_asan(3)")
    #(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_ubsan(0)")
    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_vptr_ubsan()")
    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
fi

# The torch._C._crash_if_debug_asserts_fail() function should only fail if both of the following are true:
# 1. The build is in debug mode
# 2. The value 424242 is passed in
# This tests that the debug asserts are working correctly.
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
    echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
    # Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
    echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
    (cd test && python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
fi

if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
  export ATEN_CPU_CAPABILITY=default
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
  export ATEN_CPU_CAPABILITY=avx2
fi
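# The names describe what gets disabled: nogpu_NO_AVX2 runs the baseline
# (default) CPU kernels, while nogpu_AVX512 caps dispatch at AVX2 so that
# AVX512 kernels are skipped.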

if [[ "${TEST_CONFIG}" == "legacy_nvidia_driver" ]]; then
  # Make sure that CUDA can be initialized
  (cd test && python -c "import torch; torch.rand(2, 2, device='cuda')")
  export USE_LEGACY_DRIVER=1
fi

test_python_legacy_jit() {
  time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
  assert_git_not_dirty
}

test_python_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  # A bare --include flag is not supported, and quoting $INCLUDE_CLAUSE to satisfy
  # the linter prevents it from being word-split into separate arguments, so
  # suppress SC2086 instead.

  # Modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build.
  # shellcheck disable=SC2086
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
}

test_python() {
  # shellcheck disable=SC2086
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
}

test_python_smoke() {
  # Smoke tests for H100/B200
  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_python_smoke_b200() {
  # Targeted smoke tests for B200 including FlashAttention CuTe coverage
  install_flash_attn_cute
  time python test/run_test.py \
    --include \
      test_matmul_cuda \
      test_scaled_matmul_cuda \
      inductor/test_fp8 \
      nn/attention/test_fa4 \
      nn/attention/test_open_registry \
      inductor/test_flex_flash \
      inductor/test_torchinductor \
    $PYTHON_TEST_EXTRA_OPTION \
    --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_distributed() {
  # Distributed tests at H100
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  # This test requires multicast support
  time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_symm_mem() {
  # symmetric memory test
  time python test/run_test.py --include distributed/test_symmetric_memory.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_cutlass_backend() {
  # cutlass backend tests for H100
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
}

test_lazy_tensor_meta_reference_disabled() {
  export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
  echo "Testing lazy tensor operations without meta reference"
  time python test/run_test.py --include lazy/test_ts_opinfo.py --verbose
  export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
}

test_dynamo_core() {
  time python test/run_test.py \
    --include-dynamo-core-tests \
    --verbose \
    --upload-artifacts-while-running
  assert_git_not_dirty
}

test_dynamo_wrapped_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi
  python tools/dynamo/verify_dynamo.py
  # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
  # Instead, use @skipIfTorchDynamo on your tests.
  time python test/run_test.py --dynamo \
    --exclude-inductor-tests \
    --exclude-jit-executor \
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
  assert_git_not_dirty
}

test_einops() {
  pip install einops==0.6.1
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  pip install einops==0.7.0
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  pip install einops==0.8.1
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  assert_git_not_dirty
}


test_inductor_distributed() {
  # Smuggle a few multi-gpu tests here so that we don't have to request another large node
  echo "Testing multi_gpu tests in test_torchinductor"
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_on_gpu_device1 --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_gpu_device --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_load_package_multiple_gpus --verbose
  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
  python test/run_test.py -i distributed/tensor/test_dtensor_compile.py --verbose
  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
  python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # This runs on both single-gpu and multi-gpu instances. It should be smart about
  # skipping tests that aren't supported if the required number of GPUs isn't available
  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
  assert_git_not_dirty
}

test_inductor_core() {
  time python test/run_test.py \
    --include-inductor-core-tests \
    --exclude inductor/test_benchmark_fusion \
              inductor/test_cutlass_backend \
              inductor/test_flex_attention \
              inductor/test_max_autotune \
              inductor/test_aot_inductor_arrayref \
              inductor/test_compiled_autograd \
              inductor/test_compile_subprocess \
              inductor/test_cpu_cpp_wrapper \
              inductor/test_cpu_repro \
              inductor/test_cpu_select_algorithm \
              inductor/test_torchinductor_dynamic_shapes \
              inductor/test_torchinductor \
              inductor/test_mkldnn_pattern_matcher \
              inductor/test_torchinductor_codegen_dynamic_shapes \
    --verbose \
    --upload-artifacts-while-running
  assert_git_not_dirty
}

test_inductor_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  python tools/dynamo/verify_dynamo.py
  python test/run_test.py --inductor \
    --include test_modules test_ops test_ops_gradients test_torch \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose

  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
  python test/run_test.py \
    --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
}

test_inductor_aoti_cpp() {
  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
    # We need to hipify before building again
    python3 tools/amd_build/build_amd.py
  fi
  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
    # TODO: Replace me completely, as one should not use conda libstdc++, nor need a special path to TORCH_LIB
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
  else
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
  fi

  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}

test_inductor_aoti_cross_compile_for_windows() {

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  # Set WINDOWS_CUDA_HOME environment variable
  WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
  export WINDOWS_CUDA_HOME

  echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
  echo "Contents:"
  ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true

  python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
}

test_inductor_cpp_wrapper_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  export TORCHINDUCTOR_CPP_WRAPPER=1
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  # Run certain inductor unit tests with cpp wrapper. In the end state, we
  # should be able to run all the inductor unit tests with cpp_wrapper.
  #
  # TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
  # but that should be changed in another PR to more accurately monitor the
  # increased CI usage.
  python test/run_test.py \
    --include inductor/test_torchinductor_opinfo \
    -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
  python test/run_test.py \
    --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
  python test/run_test.py --inductor \
    --include test_torch \
    -k 'take' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose

  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
    python test/run_test.py \
      --include inductor/test_mkldnn_pattern_matcher \
      -k 'xpu' \
      --shard "$1" "$NUM_TEST_SHARDS" \
      --verbose
  fi
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
# For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'
# The matrix of test options is specified in .github/workflows/inductor.yml,
# .github/workflows/inductor-periodic.yml, and
# .github/workflows/inductor-perf-test-nightly.yml
DYNAMO_BENCHMARK_FLAGS=()

pr_time_benchmarks() {

  pip_install "fbscribelogger"

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
  echo "benchmark results on current PR: "
  cat  "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "$TEST_REPORTS_DIR/new_expected_results.csv"
}

if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
  pr_time_benchmarks
  exit 0
elif [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor --inductor-compile-mode max-autotune)
elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor)
fi

if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi

if [[ "${TEST_CONFIG}" == *cpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
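# Worked example: TEST_CONFIG=dynamic_aot_eager_torchbench (from the comment
# above) accumulates:
#   DYNAMO_BENCHMARK_FLAGS=(--backend aot_eager --dynamic-shapes --dynamic-batch-only --device cuda)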

test_cachebench() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local BENCHMARK
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    BENCHMARK=torchbench
  elif [[ "${SHARD_NUMBER}" == 2 ]]; then
    BENCHMARK=huggingface
  else
    echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
    exit 1
  fi

  local mode_options=("training" "inference")

  for mode in "${mode_options[@]}"; do
    $TASKSET python "benchmarks/dynamo/cachebench.py" \
        --mode "$mode" \
        --device cuda \
        --benchmark "$BENCHMARK" \
        --repeat 3 \
        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"

    $TASKSET python "benchmarks/dynamo/cachebench.py" \
        --mode "$mode" \
        --dynamic \
        --device cuda \
        --benchmark "$BENCHMARK" \
        --repeat 3 \
        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
  done
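  # Shard 1, for example, ends up writing four reports:
  #   cachebench_torchbench_training.json, cachebench_torchbench_training_dynamic.json,
  #   cachebench_torchbench_inference.json, cachebench_torchbench_inference_dynamic.json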
}

test_verify_cachebench() {
  TMP_TEST_REPORTS_DIR=$(mktemp -d)
  TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"

  $TASKSET python "benchmarks/dynamo/cachebench.py" \
      --mode training \
      --device cpu \
      --model nanogpt \
      --benchmark torchbench \
      --output "$TEST_OUTPUT"

  # -s checks that the file exists and is non-empty
  if [[ ! -s "$TEST_OUTPUT" ]]; then
    echo "Cachebench failed to produce an output."
    echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
    exit 1
  fi
}

test_perf_for_dashboard() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local suite="$1"
  shift

  local backend=inductor
  local modes=()
  if [[ "$DASHBOARD_TAG" == *training-true* ]]; then
    modes+=(training)
  fi
  if [[ "$DASHBOARD_TAG" == *inference-true* ]]; then
    modes+=(inference)
  fi
  # TODO: All the accuracy tests can be skipped once the CI accuracy checking is stable enough
  local targets=(accuracy performance)

  local device=cuda
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    if [[ "${TEST_CONFIG}" == *cpu_x86_zen* ]]; then
      device=cpu_x86_zen
    elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
      device=cpu_x86
    elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
      device=cpu_aarch64
    fi
    test_inductor_set_cpu_affinity
  elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
    device=cuda_a10g
  elif [[ "${TEST_CONFIG}" == *h100* ]]; then
    device=cuda_h100
  elif [[ "${TEST_CONFIG}" == *b200* ]]; then
    device=cuda_b200
  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
    device=rocm
  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
    device=xpu
  fi

  for mode in "${modes[@]}"; do
    if [[ "$mode" == "inference" ]]; then
      if [[ "$device" == "cpu_x86" ]]; then
        dtype=amp
      else
        dtype=bfloat16
      fi
    elif [[ "$mode" == "training" ]]; then
      dtype=amp
    fi
    for target in "${targets[@]}"; do
      local target_flag=("--${target}")
      if [[ "$target" == "performance" ]]; then
        target_flag+=( --cold-start-latency)
      elif [[ "$target" == "accuracy" ]]; then
        target_flag+=( --no-translation-validation)
      fi

      if [[ "$DASHBOARD_TAG" == *freezing-true* ]]; then
        target_flag+=( --freezing)
      fi

      if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
            --dynamic-batch-only "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
        TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
        TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
        if [[ "$target" == "accuracy" ]]; then
          # Also collect Export pass rate and display as a separate row
          $TASKSET python "benchmarks/dynamo/$suite.py" \
              "${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \
              --output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv"
        fi
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
        TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
    done
  done
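  # Example: with DASHBOARD_TAG=training-true-default-true on CUDA, only the
  # training/amp configuration runs, for both targets, writing
  #   inductor_no_cudagraphs_<suite>_amp_training_cuda_accuracy.csv
  #   inductor_no_cudagraphs_<suite>_amp_training_cuda_performance.csv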
}

test_single_dynamo_benchmark() {
  # Usage: test_single_dynamo_benchmark inductor_inference huggingface 0 --args-for-script

  # Using the test-reports directory under the test folder allows the CI to
  # automatically pick up the test reports and upload them to S3. We need to use
  # the full path here, otherwise the script will bark about file not found later on
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local name="$1"
  shift
  local suite="$1"
  shift
  # a shard id must always be supplied (even if empty), since it is shifted off here
  local shard_id="$1"
  shift

  local partition_flags=()
  if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
    partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
  fi

  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
    python "benchmarks/dynamo/$suite.py" \
      --ci --performance --disable-cudagraphs --inductor \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}" \
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
    if [[ "${TEST_CONFIG}" == *_avx2* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx2/}
    fi
    if [[ "${TEST_CONFIG}" == *_avx512* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx512/}
    fi
    python "benchmarks/dynamo/$suite.py" \
      --ci --accuracy --timing --explain --print-compilation-time \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
      "$@" "${partition_flags[@]}" \
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}${TEST_CONFIG}_${name}.csv"
    python benchmarks/dynamo/check_graph_breaks.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}${TEST_CONFIG}_${name}.csv"
  fi
}

test_inductor_micro_benchmark() {
  # torchao requires CUDA compute capability 8.0 or above for bfloat16 support
  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;8.6"
  fi
  install_torchao
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    test_inductor_set_cpu_affinity
  fi
  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}

test_inductor_halide() {
  python test/run_test.py --include inductor/test_halide.py --verbose
  assert_git_not_dirty
}

test_inductor_pallas() {
  python test/run_test.py --include inductor/test_pallas.py --verbose
  assert_git_not_dirty
}

test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
}

test_dynamo_benchmark() {
  # Usage: test_dynamo_benchmark huggingface 0
  TEST_REPORTS_DIR=$(pwd)/test/test-reports

  local suite="$1"
  shift
  local shard_id="$1"
  shift

  # Exclude torchrec_dlrm for CUDA 13 as FBGEMM is not compatible
  local extra_args=()
  if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
    extra_args=(--exclude-exact torchrec_dlrm)
  fi

  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
    test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    # TODO (huydhn): Just smoke test some sample models
    if [[ "${TEST_CONFIG}" == *b200* ]]; then
      if [[ "${suite}" == "huggingface" ]]; then
        export TORCHBENCH_ONLY_MODELS="DistillGPT2"
      elif [[ "${suite}" == "timm_models" ]]; then
        export TORCHBENCH_ONLY_MODELS="inception_v3"
      elif [[ "${suite}" == "torchbench" ]]; then
        export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
      fi
    fi
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "${extra_args[@]}" "$@"
  else
    if [[ "${TEST_CONFIG}" == *cpu* ]]; then
      local dt="float32"
      if [[ "${TEST_CONFIG}" == *amp* ]]; then
        dt="amp"
      fi
      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "${extra_args[@]}" "$@"
      else
        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "${extra_args[@]}" "$@"
      fi
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
    elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
    else
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
      test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
    fi
  fi
}

test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4

  # Check memory compression ratio for a few models
  for test in BERT_pytorch yolov3; do
    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
      --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
      --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
    cat "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
    python benchmarks/dynamo/check_memory_compression_ratio.py --actual \
      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
  done

  # Perform some "warm-start" runs for a few huggingface models.
  for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_huggingface_training.csv"
  done
}

test_inductor_set_cpu_affinity(){
  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
  export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
  export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"

  if [[ "$(uname -m)" != "aarch64" ]]; then
    # Use Intel OpenMP for x86
    IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
    export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
    export KMP_AFFINITY=granularity=fine,compact,1,0
    export KMP_BLOCKTIME=1
  fi

  # Use nproc here instead of lscpu because it takes into account cgroups slice
  cpus=$(nproc)
  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
  cores=$((cpus / thread_per_core))

  # Set number of cores to 16 on aarch64 for performance runs
  if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
    cores=16
  fi
  export OMP_NUM_THREADS=$cores

  # Handle cgroups slice start and end CPU
  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
  # Leaving one physical CPU for other tasks
  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
  export TASKSET="taskset -c $start_cpu-$end_cpu"
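  # Worked example on a 32-vCPU SMT-2 x86 runner with affinity 0-31: cpus=32,
  # thread_per_core=2, cores=16, so OMP_NUM_THREADS=16 and, with one physical
  # core reserved, TASKSET="taskset -c 0-29".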
}

test_inductor_torchbench_cpu_smoketest_perf(){
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  test_inductor_set_cpu_affinity
  MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
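  # Each non-comment row of that CSV is assumed to follow the layout
  #   model,backend,data_type,shape_mode,wrapper,speedup_target
  # matching how model_cfg is indexed in the loop below.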

  grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
  do
    local model_name=${model_cfg[0]}
    local data_type=${model_cfg[2]}
    local speedup_target=${model_cfg[5]}
    local backend=${model_cfg[1]}
    if [[ ${model_cfg[4]} == "cpp" ]]; then
      export TORCHINDUCTOR_CPP_WRAPPER=1
    else
      unset TORCHINDUCTOR_CPP_WRAPPER
    fi
    local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"

    if [[ ${model_cfg[3]} == "dynamic" ]]; then
      $TASKSET python benchmarks/dynamo/torchbench.py \
        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
        --dynamic-batch-only --freezing --timeout 9000 --"$backend" --output "$output_name"
    else
      $TASKSET python benchmarks/dynamo/torchbench.py \
        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
        --freezing --timeout 9000 --"$backend" --output "$output_name"
    fi
    cat "$output_name"
    # The threshold value needs to be actively maintained to make this check useful.
    # Allow 1% variance for CPU perf to accommodate perf fluctuation
    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" -s 0.99
  done
}

test_torchbench_gcp_smoketest(){
  pushd "${TORCHBENCHPATH}"
  python test.py -v
  popd
}

test_aten() {
  # Test ATen
  # The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
  # scalar_tensor_test, basic, native_test
  echo "Running ATen tests with pytorch lib"

  if [[ -n "$IN_WHEEL_TEST" ]]; then
    echo "Running test with the install folder"
    # Rename the build folder when running the test to ensure it does not
    # depend on the build folder
    mv "$BUILD_DIR" "$BUILD_RENAMED_DIR"
    TEST_BASE_DIR="$TORCH_TEST_DIR"
  else
    echo "Running test with the build folder"
    TEST_BASE_DIR="$BUILD_BIN_DIR"
  fi

  # NB: the ATen test binaries don't have RPATH set, so it's necessary to
  # put the dynamic libraries somewhere where the dynamic linker can find them.
  # This is a bit of a hack.
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libc10* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libcaffe2* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"

  ls "$TEST_BASE_DIR"
  aten/tools/run_tests.sh "$TEST_BASE_DIR"

  if [[ -n "$IN_WHEEL_TEST" ]]; then
    # Restore the build folder to avoid any impact on other tests
    mv "$BUILD_RENAMED_DIR" "$BUILD_DIR"
  fi

  assert_git_not_dirty
}

test_without_numpy() {
  pushd "$(dirname "${BASH_SOURCE[0]}")"
  python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())"
  # Regression test for https://github.com/pytorch/pytorch/issues/66353
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
  # Regression test for https://github.com/pytorch/pytorch/issues/109387
  if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
    python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
  fi
  # Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
  popd
}

test_libtorch() {
  local SHARD="$1"

  # The slow test config corresponds to a default test config that should run
  # the libtorch tests instead.
  if [[ "$TEST_CONFIG" != "slow" ]]; then
    echo "Testing libtorch"
    ln -sf "$TORCH_LIB_DIR"/libbackend_with_compiler.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libjitbackend_test.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libcaffe2_nvrtc.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"

    if [[ -z "${SHARD}" || "${SHARD}" == "1" ]]; then
      test_libtorch_api
    fi

    if [[ -z "${SHARD}" || "${SHARD}" == "2" ]]; then
      test_libtorch_jit
    fi

    assert_git_not_dirty
  fi
}

test_libtorch_jit() {
  # Prepare the model used by test_jit, the model needs to be in the test directory
  # to get picked up by run_test
  pushd test
  python cpp/jit/tests_setup.py setup
  popd

  # Run jit and lazy tensor cpp tests together to finish them faster
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
  else
    # CUDA tests have already been skipped when CUDA is not available
    python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"
  fi

  # Cleaning up test artifacts in the test folder
  pushd test
  python cpp/jit/tests_setup.py shutdown
  popd
}

test_libtorch_api() {
  # Download the MNIST dataset needed by the C++ API tests
  MNIST_DIR="${PWD}/test/cpp/api/mnist"
  python tools/download_mnist.py --quiet -d "${MNIST_DIR}"

  if [[ "$BUILD_ENVIRONMENT" == *asan* || "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
    TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
    mkdir -p "$TEST_REPORTS_DIR"

    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:"$TEST_REPORTS_DIR"/test_api.xml
  else
    # Exclude IMethodTest that relies on torch::deploy, which will instead be run in test_deploy
    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"

  fi

  # quantization is not fully supported on s390x yet
  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* && "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
    # NB: This test is not under TORCH_BIN_DIR but under BUILD_BIN_DIR
    export CPP_TESTS_DIR="${BUILD_BIN_DIR}"
    python test/run_test.py --cpp --verbose -i cpp/static_runtime_test
  fi
}

test_xpu_bin(){
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*; do
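    # Without nullglob, a glob that matches nothing expands to itself; the
    # *"*"* check below skips that literal pattern.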
    if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
      case_name=$(basename "$xpu_case")
      echo "Testing ${case_name} ..."
      "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
    fi
  done
}

test_aot_compilation() {
  echo "Testing Ahead of Time compilation"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
  ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"

  if [ -f "$TORCH_BIN_DIR"/test_mobile_nnc ]; then
    CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_mobile_nnc
  fi

  if [ -f "$TORCH_BIN_DIR"/aot_model_compiler_test ]; then
    source test/mobile/nnc/test_aot_compile.sh
  fi
}

test_vulkan() {
  if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_TEST_DIR"
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_TEST_DIR"
    export VK_ICD_FILENAMES=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/vk_swiftshader_icd.json
    CPP_TESTS_DIR="${TORCH_TEST_DIR}" LD_LIBRARY_PATH=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/ python test/run_test.py --cpp --verbose -i cpp/vulkan_api_test
  fi
}

test_distributed() {
  echo "Testing distributed python tests"
  # shellcheck disable=SC2086
  time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" $INCLUDE_CLAUSE --verbose
  assert_git_not_dirty

  if [[ ("$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm*) && "$SHARD_NUMBER" == 1 ]]; then
    echo "Testing distributed C++ tests"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
    # These are distributed tests, so let's continue running them sequentially here to avoid
    # any surprise
    python test/run_test.py --cpp --verbose -i cpp/FileStoreTest
    python test/run_test.py --cpp --verbose -i cpp/HashStoreTest
    python test/run_test.py --cpp --verbose -i cpp/TCPStoreTest

    echo "Testing multi-GPU linalg tests"
    python test/run_test.py -i test_linalg.py -k test_matmul_offline_mgpu_tunable --verbose

    if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
      MPIEXEC=$(command -v mpiexec)
      if [[ -n "$MPIEXEC" ]]; then
        # NB: mpiexec only works directly with the C++ test binary here
        MPICMD="${MPIEXEC} -np 2 $TORCH_BIN_DIR/ProcessGroupMPITest"
        eval "$MPICMD"
      fi

      python test/run_test.py --cpp --verbose -i cpp/ProcessGroupGlooTest
      python test/run_test.py --cpp --verbose -i cpp/ProcessGroupNCCLTest
      python test/run_test.py --cpp --verbose -i cpp/ProcessGroupNCCLErrorsTest
    fi
  fi
}

test_quantization() {
  echo "Testing quantization"

  python test/test_quantization.py
}

test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
  # test reporting process to function as expected.
  ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"

  CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
}

test_custom_backend() {
  echo "Testing custom backends"
  CUSTOM_BACKEND_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-backend-build"
  pushd test/custom_backend
  cp -a "$CUSTOM_BACKEND_BUILD" build
  # Run tests Python-side and export a lowered module.
  python test_custom_backend.py -v
  python backend.py --export-module-to=model.pt
  # Run tests C++-side and load the exported lowered module.
  build/test_custom_backend ./model.pt
  rm -f ./model.pt
  popd
  assert_git_not_dirty
}

test_custom_script_ops() {
  echo "Testing custom script operators"

  if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
    echo "Skipping custom script operators until it's fixed"
    return 0
  fi

  CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
  pushd test/custom_operator
  cp -a "$CUSTOM_OP_BUILD" build
  # Run tests Python-side and export a script module.
  python test_custom_ops.py -v
  python model.py --export-script-module=model.pt
  # Run tests C++-side and load the exported script module.
  build/test_custom_ops ./model.pt
  popd
  assert_git_not_dirty
}

test_libtorch_agnostic_targetting() {
    echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"

    REPO_DIR=$(pwd)
    WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"

    # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
    echo "Building 2.9 extension wheel with current PyTorch..."
    pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
    time python setup.py bdist_wheel

    # Save the wheel
    mkdir -p "$WHEEL_DIR"
    cp dist/*.whl "$WHEEL_DIR/"
    WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
    echo "Built wheel: $(basename "$WHEEL_FILE")"
    popd

    # Create venv and install PyTorch 2.9
    python -m venv venv_pytorch_2_9
    # shellcheck disable=SC1091
    . venv_pytorch_2_9/bin/activate

    # Clear PYTHONPATH to avoid using the development PyTorch
    echo "Clearing PYTHONPATH to use only venv packages..."
    unset PYTHONPATH

    # Upgrade pip to latest version
    echo "Upgrading pip to latest version..."
    pip install --upgrade pip
    pip --version

    echo "Installing PyTorch 2.9..."

    # Install from release channel only
    PYTORCH_VERSION="2.9.0"

    # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
    if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
        CUDA_MAJOR="${BASH_REMATCH[1]}"
        CUDA_MINOR="${BASH_REMATCH[2]}"
        CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
        echo "  Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
    else
        # Default to CPU build
        CUDA_VERSION="cpu"
        echo "  No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
    fi

    if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
        echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
    else
        echo "  FAILED to install PyTorch 2.9.0 from release channel"
        echo "  URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
        deactivate
        rm -rf venv_pytorch_2_9
        return 1
    fi

    INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
    echo "  Installed version: $INSTALLED_VERSION"

    # Install test dependencies
    echo "Installing test dependencies..."
    pip install expecttest numpy unittest-xml-reporting

    # Install the pre-built wheel
    echo ""
    echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
    pip install "$WHEEL_FILE"
    echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"

    # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
    echo ""
    echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
    if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
        echo ""
        echo "  Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
    else
        echo "targeting test failed"
        deactivate
        rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
        return 1
    fi

    deactivate
    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"

    assert_git_not_dirty
}

test_jit_hooks() {
  echo "Testing jit hooks in cpp"
  HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
  pushd test/jit_hooks
  cp -a "$HOOK_BUILD" build
  # Run tests Python-side and export the script modules with hooks
  python model.py --export-script-module=model
  # Run tests C++-side and load the exported script modules
  build/test_jit_hooks ./model
  popd
  assert_git_not_dirty
}

test_torch_function_benchmark() {
  echo "Testing __torch_function__ benchmarks"
  pushd benchmarks/overrides_benchmark
  python bench.py -n 1 -m 2
  python pyspybench.py Tensor -n 1
  python pyspybench.py SubTensor -n 1
  python pyspybench.py WithTorchFunction -n 1
  python pyspybench.py SubWithTorchFunction -n 1
  popd
  assert_git_not_dirty
}

build_xla() {
  # xla test needs pytorch headers in torch/include
  pushd ..
  python -c "import os, torch, shutil; shutil.copytree(os.path.join(os.path.dirname(torch.__file__), 'include'), 'workspace/torch/include', dirs_exist_ok=True)"
  popd

  # xla test needs sccache setup.
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

  XLA_DIR=xla
  USE_CACHE=1
  clone_pytorch_xla
  # shellcheck disable=SC1091
  source "xla/.circleci/common.sh"

  # TODO: The torch pin #73164 is involved in the sev https://github.com/pytorch/pytorch/issues/86093
  # so this is temporarily removed until XLA fixes the weird logic in https://github.com/pytorch/xla/blob/master/scripts/apply_patches.sh#L17-L18
  rm "${XLA_DIR}/torch_patches/.torch_pin" || true

  apply_patches
  SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
  # These functions are defined in .circleci/common.sh in pytorch/xla repo
  retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
  assert_git_not_dirty
}

test_xla() {
  # xla test needs sccache setup.
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

  clone_pytorch_xla
  # shellcheck disable=SC1091
  source "./xla/.circleci/common.sh"
  SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
  # Set LD_LIBRARY_PATH for C++ tests
  export LD_LIBRARY_PATH="/opt/conda/lib/:${LD_LIBRARY_PATH}"
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SKIP_MP_OP_TESTS=1 run_torch_xla_tests "$(pwd)" "$(pwd)/xla"
  assert_git_not_dirty
}

function check_public_api_test_fails {
    test_name=$1
    invalid_item_name=$2
    invalid_item_desc=$3

    echo "Running public API test '${test_name}'..."
    test_output=$(python test/test_public_bindings.py -k "${test_name}" 2>&1) && ret=$? || ret=$?

    # Ensure test fails correctly.
    if [ "$ret" -eq 0 ]; then
        cat << EOF
Expected the public API test '${test_name}' to fail after introducing
${invalid_item_desc}, but it succeeded! Check test/test_public_bindings.py
for any changes that may have broken the test.
EOF
        return 1
    fi

    # Ensure invalid item is in the test output.
    echo "${test_output}" | grep -q "${invalid_item_name}" && ret=$? || ret=$?

    if [ $ret -ne 0 ]; then
        cat << EOF
Expected the public API test '${test_name}' to identify ${invalid_item_desc}, but
it didn't! It's possible the test may not have run. Check test/test_public_bindings.py
for any changes that may have broken the test.
EOF
        return 1
    fi

    echo "Success! '${test_name}' identified ${invalid_item_desc} ${invalid_item_name}."
    return 0
}

# Do NOT run this test before any other tests (e.g. test_python_shard):
# this function uninstalls the torch built from the branch and installs the
# torch built at its base commit, so anything that runs after it would test
# the wrong build.
test_forward_backward_compatibility() {
  set -x

  # First, validate public API tests in the torch built from branch.
  # Step 1. Make sure the public API test "test_correct_module_names" fails when a new file
  # introduces an invalid public API function.
  new_filename=$(mktemp XXXXXXXX.py -p "${TORCH_INSTALL_DIR}")

  BAD_PUBLIC_FUNC=$(
  cat << 'EOF'
def new_public_func():
  pass

# valid public API functions have __module__ set correctly
new_public_func.__module__ = None
EOF
  )

  echo "${BAD_PUBLIC_FUNC}" >> "${new_filename}"
  invalid_api="torch.$(basename -s '.py' "${new_filename}").new_public_func"
  echo "Created an invalid public API function ${invalid_api}..."

  check_public_api_test_fails \
      "test_correct_module_names" \
      "${invalid_api}" \
      "an invalid public API function" && ret=$? || ret=$?

  rm -v "${new_filename}"

  if [ "$ret" -ne 0 ]; then
      exit 1
  fi

  # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
  # file is modified to introduce an invalid public API function.
  # The filepath here must not have __all__ defined in it, otherwise the test will pass.
  # If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
  # that does not have __all__ defined.
  EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
  cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
  echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
  invalid_api="torch.cuda.streams.new_public_func"
  echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."

  check_public_api_test_fails \
      "test_correct_module_names" \
      "${invalid_api}" \
      "an invalid public API function" && ret=$? || ret=$?

  mv -v "${EXISTING_FILEPATH}.orig" "${EXISTING_FILEPATH}"

  if [ "$ret" -ne 0 ]; then
      exit 1
  fi

  # Step 3. Make sure that the public API test "test_modules_can_be_imported" fails when a module
  # cannot be imported.
  new_module_dir=$(mktemp XXXXXXXX -d -p "${TORCH_INSTALL_DIR}")
  echo "invalid syntax garbage" > "${new_module_dir}/__init__.py"
  invalid_module_name="torch.$(basename "${new_module_dir}")"

  check_public_api_test_fails \
      "test_modules_can_be_imported" \
      "${invalid_module_name}" \
      "a non-importable module" && ret=$? || ret=$?

  rm -rv "${new_module_dir}"

  if [ "$ret" -ne 0 ]; then
      exit 1
  fi

  # Next, build torch from the merge base.
  REPO_DIR=$(pwd)
  if [[ "${BASE_SHA}" == "${SHA1}" ]]; then
    echo "On trunk, we should compare schemas with torch built from the parent commit"
    SHA_TO_COMPARE=$(git rev-parse "${SHA1}"^)
  else
    echo "On pull, we should compare schemas with torch built from the merge base"
    SHA_TO_COMPARE=$(git merge-base "${SHA1}" "${BASE_SHA}")
  fi
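  # Illustrative example (hypothetical SHAs): on a PR with SHA1=abc123 and
  # BASE_SHA=def456, this compares against `git merge-base abc123 def456`;
  # on trunk (SHA1 == BASE_SHA) it compares against the parent, abc123^.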
  export SHA_TO_COMPARE

  # create a dummy ts model at this version
  python test/create_dummy_torchscript_model.py /tmp/model_new.pt
  python -m venv venv
  # shellcheck disable=SC1091
  . venv/bin/activate

  # build torch at the base commit to generate a base function schema for comparison
  git reset --hard "${SHA_TO_COMPARE}"
  git submodule sync && git submodule update --init --recursive
  echo "::group::Installing Torch From Base Commit"
  pip3 install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
  python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
  python -mpip install base_dist/*.whl
  echo "::endgroup::"

  pushd test/forward_backward_compatibility
  pip show torch
  python dump_all_function_schemas.py --filename nightly_schemas.txt

  git reset --hard "${SHA1}"
  git submodule sync && git submodule update --init --recursive
  # FC: verify that the new model can be loaded with old code.
  if ! python ../load_torchscript_model.py /tmp/model_new.pt; then
      echo "FC check failed: new model cannot be loaded with old code"
      return 1
  fi
  python ../create_dummy_torchscript_model.py /tmp/model_old.pt
  deactivate
  rm -r "${REPO_DIR}/venv" "${REPO_DIR}/base_dist"
  pip show torch
  python check_forward_backward_compatibility.py --existing-schemas nightly_schemas.txt
  # BC: verify that the old model can be loaded with new code.
  if ! python ../load_torchscript_model.py /tmp/model_old.pt; then
      echo "BC check failed: old model cannot be loaded with new code"
      return 1
  fi
  popd
  set +x
  assert_git_not_dirty
}

test_bazel() {
  set -e -o pipefail

  # bazel test needs sccache setup.
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

  get_bazel

  if [[ "$CUDA_VERSION" == "cpu" ]]; then
    # Test //c10/... without Google flags and logging libraries. The
    # :all_tests target in the subsequent Bazel invocation tests
    # //c10/... with the Google libraries.
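    # (`--no//c10:use_gflags` is Bazel shorthand for setting the boolean
    # build flag //c10:use_gflags to false; likewise for use_glog.)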
    tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA \
      --no//c10:use_gflags --no//c10:use_glog //c10/...

    tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests
  else
    # Increase the test timeout to 480s, matching the CPU tests, because modules_test frequently times out
    tools/bazel test --test_timeout=480 --test_output=errors \
      //:any_test \
      //:autograd_test \
      //:dataloader_test \
      //:dispatch_test \
      //:enum_test \
      //:expanding_array_test \
      //:fft_test \
      //:functional_test \
      //:grad_mode_test \
      //:inference_mode_test \
      //:init_test \
      //:jit_test \
      //:memory_test \
      //:meta_tensor_test \
      //:misc_test \
      //:moduledict_test \
      //:modulelist_test \
      //:modules_test \
      //:namespace_test \
      //:nested_test \
      //:nn_utils_test \
      //:operations_test \
      //:ordered_dict_test \
      //:parallel_benchmark_test \
      //:parameterdict_test \
      //:parameterlist_test \
      //:sequential_test \
      //:serialize_test \
      //:special_test \
      //:static_test \
      //:support_test \
      //:tensor_flatten_test \
      //:tensor_indexing_test \
      //:tensor_options_cuda_test \
      //:tensor_options_test \
      //:tensor_test \
      //:torch_dist_autograd_test \
      //:torch_include_test \
      //:transformer_test \
      //:test_bazel \
      //c10/cuda/test:test \
      //c10/test:core_tests \
      //c10/test:typeid_test \
      //c10/test:util/ssize_test \
      //c10/test:util_base_tests
  fi
}

test_benchmarks() {
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && $TEST_CONFIG != *nogpu* ]]; then
    pip_install "pytest-benchmark==3.2.3"
    pip_install "requests"
    BENCHMARK_DATA="benchmarks/.data"
    mkdir -p ${BENCHMARK_DATA}
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling
    # TODO: Enable these for GHA once we have credentials for forked pull requests
    if [[ -z "${GITHUB_ACTIONS}" ]]; then
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_profiling_te.json
    fi
    assert_git_not_dirty
  fi
}

test_cpp_extensions() {
  # This tests whether the cpp extension build is compatible with the current env; there is no need to test both the ninja and no-ninja builds.
  time python test/run_test.py --include test_cpp_extensions_aot_ninja --verbose
  assert_git_not_dirty
}

test_vec256() {
  # This tests vec256 instructions DEFAULT/AVX/AVX2 (platform dependent; some platforms might not support AVX/AVX2)
  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
    echo "Testing vec256 instructions"
    # Use an absolute path so the XML reports land in the repo-level
    # test-reports directory rather than relative to build/bin after the pushd.
    VEC256_REPORTS_DIR="$(pwd)/test/test-reports/vec256"
    mkdir -p "$VEC256_REPORTS_DIR"
    pushd build/bin
    vec256_tests=$(find . -maxdepth 1 -executable -name 'vec256_test*')
    # Unquoted expansion is intentional: word splitting yields one test per
    # iteration, and the binary names contain no whitespace.
    for vec256_exec in $vec256_tests
    do
      "$vec256_exec" --gtest_output=xml:"$VEC256_REPORTS_DIR/$vec256_exec.xml"
    done
    popd
    assert_git_not_dirty
  fi
}

test_docs_test() {
  .ci/pytorch/docs-test.sh
}

test_executorch() {
  echo "Install torchvision and torchaudio"
  install_torchvision
  install_torchaudio

  INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"

  pushd /executorch
  "${INSTALL_SCRIPT}" setup_executorch

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
  # shellcheck disable=SC1091
  LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh

  echo "Run ExecuTorch regression tests for some models"
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
  source .ci/scripts/test_model.sh mv3 cmake xnnpack-quantization-delegation ''

  popd

  assert_git_not_dirty
}

test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
        distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
        test_linalg \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
  python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
       dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
       dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles \
       --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Inductor tests
  python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
       inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
       inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
       inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
       inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
       inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
       inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \
       inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
       inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \
       inductor/test_cpu_select_algorithm \
       --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
}

test_operator_benchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)
  ARCH=$(uname -m)

  test_inductor_set_cpu_affinity
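
  # $TASKSET is expected to be set by test_inductor_set_cpu_affinity, e.g.
  # something like "taskset -c 0-31" (illustrative value), pinning the
  # benchmarks to a fixed set of cores for stable numbers.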

  cd benchmarks/operator_benchmark/pt_extension
  python -m pip install . -v --no-build-isolation

  cd "${TEST_DIR}"/benchmarks/operator_benchmark
  $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
      --output-csv "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.json"

  pip_install pandas
  python check_perf_csv.py \
      --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
      --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
}

test_operator_microbenchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)

  test_inductor_set_cpu_affinity

  cd benchmarks/operator_benchmark/pt_extension
  python -m pip install . -v --no-build-isolation

  cd "${TEST_DIR}"/benchmarks/operator_benchmark

  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv optimizer; do
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
      --benchmark-name "PyTorch operator microbenchmark" --use-compile
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
      --benchmark-name "PyTorch operator microbenchmark"
  done
}

test_attention_microbenchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)

  # Install attention-gym dependency
  echo "Installing attention-gym..."
  python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main
  pip show triton

  cd "${TEST_DIR}"/benchmarks/transformer

  $TASKSET python score_mod.py --config configs/config_basic.yaml \
    --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json"
}

test_openreg() {
  python test/run_test.py --openreg --verbose
  assert_git_not_dirty
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
  # Install numpy-2.0.2 and compatible scipy & numba versions
  # Force a re-install of pandas to avoid an error where pandas checks the numpy version from its initial install and fails upon import
  TMP_PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)" 2>/dev/null)
  if [ -n "$TMP_PANDAS_VERSION" ]; then
    python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 pandas=="$TMP_PANDAS_VERSION" --force-reinstall
  else
    python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
  fi
  python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
  test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after the BC check tests; see the comment above test_forward_backward_compatibility.
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
  test_libtorch_agnostic_targetting
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
  test_xla
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
    echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
    (cd .ci/lumen_cli && python -m pip install -e .)
    python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
  test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
elif [[ "$TEST_CONFIG" == distributed ]]; then
  test_distributed
  # Only run RPC C++ tests on the first shard
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_rpc
  fi
elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
  TEST_MODE="short"

  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    if [[ "${TEST_CONFIG}" == *long* ]]; then
      TEST_MODE="long"
    elif [[ "${TEST_CONFIG}" == *all* ]]; then
      TEST_MODE="all"
    fi

    test_operator_benchmark cpu ${TEST_MODE}

  fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
  test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then
  test_attention_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
  test_inductor_pallas
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
  test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
  test_inductor_aoti_cross_compile_for_windows
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
  install_torchvision
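  # test_dynamo_benchmark takes a 0-indexed shard id; SHARD_NUMBER is 1-indexed.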
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  install_torchvision
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
  install_torchaudio
  install_torchvision
  PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
  install_torchaudio
  install_torchvision
  PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  install_torchaudio
  install_torchvision
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
    PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
    PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
  elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
    TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
  else
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
    # Skip torchrec/fbgemm for cuda13 as they're not compatible yet
    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* && "${BUILD_ENVIRONMENT}" != *cuda13* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
  fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
    test_inductor_aoti_cpp
  fi
elif [[ "${TEST_CONFIG}" == *inductor_core* ]]; then
  test_inductor_core
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
  test_dynamo_core
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
  install_torchvision
  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_aten
  fi
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
  install_torchvision
  test_python_shard "$SHARD_NUMBER"
  test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_lazy_tensor_meta_reference_disabled
  test_without_numpy
  install_torchvision
  test_python_shard 1
  test_aten
  test_libtorch 1
  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
    test_xpu_bin
  fi
elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_python_shard 2
  test_libtorch 2
  test_aot_compilation
  test_custom_script_ops
  test_custom_backend
  test_torch_function_benchmark
elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then
  # Handle arbitrary number of shards
  install_torchvision
  test_python_shard "$SHARD_NUMBER"
elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
  test_vulkan
elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  test_bazel
elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  install_torchvision
  test_python
  test_aten
  test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
  test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
  test_h100_cutlass_backend
elif [[ "${TEST_CONFIG}" == openreg ]]; then
  test_openreg
else
  install_torchvision
  install_monkeytype
  test_python
  test_aten
  test_vec256
  test_libtorch
  test_aot_compilation
  test_custom_script_ops
  test_custom_backend
  test_torch_function_benchmark
  test_benchmarks
fi
